import warnings
"ignore", message="numpy.ufunc size changed")
warnings.filterwarnings(
import numpy as np
import pandas as pd
"display.max.columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option(
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
"ggplot")
plt.style.use(
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import (LinearRegression,
Ridge,
Lasso)from sklearn.metrics import (r2_score,
mean_absolute_error, mean_squared_error)
Libraries
The Dataset
The dataset is the Appliances Energy Prediction data. It was recorded at 10-minute intervals over about 4.5 months. The house temperature and humidity conditions were monitored with a ZigBee wireless sensor network. Each wireless node transmitted the temperature and humidity conditions roughly every 3.3 minutes; the wireless data was then averaged over 10-minute periods. The energy data was logged every 10 minutes with m-bus energy meters. Weather data from the nearest airport weather station (Chievres Airport, Belgium) was downloaded from a public data set from Reliable Prognosis (rp5.ru) and merged with the experimental data sets using the date and time column. Two random variables have been included in the data set for testing the regression models and for filtering out non-predictive attributes (parameters).
# Load the raw dataset and preview its first five rows.
energy = pd.read_csv("datasets/energydata_complete.csv")
energy.head()
| | date | Appliances | lights | T1 | RH_1 | T2 | RH_2 | T3 | RH_3 | T4 | RH_4 | T5 | RH_5 | T6 | RH_6 | T7 | RH_7 | T8 | RH_8 | T9 | RH_9 | T_out | Press_mm_hg | RH_out | Windspeed | Visibility | Tdewpoint | rv1 | rv2 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-01-11 17:00:00 | 60 | 30 | 19.89 | 47.596667 | 19.2 | 44.790000 | 19.79 | 44.730000 | 19.000000 | 45.566667 | 17.166667 | 55.20 | 7.026667 | 84.256667 | 17.200000 | 41.626667 | 18.2 | 48.900000 | 17.033333 | 45.53 | 6.600000 | 733.5 | 92.0 | 7.000000 | 63.000000 | 5.3 | 13.275433 | 13.275433 |
1 | 2016-01-11 17:10:00 | 60 | 30 | 19.89 | 46.693333 | 19.2 | 44.722500 | 19.79 | 44.790000 | 19.000000 | 45.992500 | 17.166667 | 55.20 | 6.833333 | 84.063333 | 17.200000 | 41.560000 | 18.2 | 48.863333 | 17.066667 | 45.56 | 6.483333 | 733.6 | 92.0 | 6.666667 | 59.166667 | 5.2 | 18.606195 | 18.606195 |
2 | 2016-01-11 17:20:00 | 50 | 30 | 19.89 | 46.300000 | 19.2 | 44.626667 | 19.79 | 44.933333 | 18.926667 | 45.890000 | 17.166667 | 55.09 | 6.560000 | 83.156667 | 17.200000 | 41.433333 | 18.2 | 48.730000 | 17.000000 | 45.50 | 6.366667 | 733.7 | 92.0 | 6.333333 | 55.333333 | 5.1 | 28.642668 | 28.642668 |
3 | 2016-01-11 17:30:00 | 50 | 40 | 19.89 | 46.066667 | 19.2 | 44.590000 | 19.79 | 45.000000 | 18.890000 | 45.723333 | 17.166667 | 55.09 | 6.433333 | 83.423333 | 17.133333 | 41.290000 | 18.1 | 48.590000 | 17.000000 | 45.40 | 6.250000 | 733.8 | 92.0 | 6.000000 | 51.500000 | 5.0 | 45.410389 | 45.410389 |
4 | 2016-01-11 17:40:00 | 60 | 40 | 19.89 | 46.333333 | 19.2 | 44.530000 | 19.79 | 45.000000 | 18.890000 | 45.530000 | 17.200000 | 55.09 | 6.366667 | 84.893333 | 17.200000 | 41.230000 | 18.1 | 48.590000 | 17.000000 | 45.40 | 6.133333 | 733.9 | 92.0 | 5.666667 | 47.666667 | 4.9 | 10.084097 | 10.084097 |
Dataset Description
The attribute information can be seen below.
Attribute Information:
Attribute | Description | Units |
---|---|---|
Date | time | year-month-day hour:minute:second |
Appliances | energy use | in Wh |
lights | energy use of light fixtures in the house | in Wh |
T1 | Temperature in kitchen area | in Celsius |
RH_1 | Humidity in kitchen area | in % |
T2 | Temperature in living room area | in Celsius |
RH_2 | Humidity in living room area | in % |
T3 | Temperature in laundry room area | in Celsius |
RH_3 | Humidity in laundry room area | in % |
T4 | Temperature in office room | in Celsius |
RH_4 | Humidity in office room | in % |
T5 | Temperature in bathroom | in Celsius |
RH_5 | Humidity in bathroom | in % |
T6 | Temperature outside the building (north side) | in Celsius |
RH_6 | Humidity outside the building (north side) | in % |
T7 | Temperature in ironing room | in Celsius |
RH_7 | Humidity in ironing room | in % |
T8 | Temperature in teenager room 2 | in Celsius |
RH_8 | Humidity in teenager room 2 | in % |
T9 | Temperature in parents room | in Celsius |
RH_9 | Humidity in parents room | in % |
T_out | Temperature outside (from Chievres weather station) | in Celsius |
Press_mm_hg | Pressure (from Chievres weather station) | in mm Hg |
RH_out | Humidity outside (from Chievres weather station) | in % |
Windspeed | Wind speed (from Chievres weather station) | in m/s |
Visibility | (from Chievres weather station) | in km |
Tdewpoint | Dew point temperature (from Chievres weather station) | in °C |
rv1 | Random variable 1 | nondimensional |
rv2 | Random variable 2 | nondimensional |
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
energy.describe()
| | Appliances | lights | T1 | RH_1 | T2 | RH_2 | T3 | RH_3 | T4 | RH_4 | T5 | RH_5 | T6 | RH_6 | T7 | RH_7 | T8 | RH_8 | T9 | RH_9 | T_out | Press_mm_hg | RH_out | Windspeed | Visibility | Tdewpoint | rv1 | rv2 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 |
mean | 97.694958 | 3.801875 | 21.686571 | 40.259739 | 20.341219 | 40.420420 | 22.267611 | 39.242500 | 20.855335 | 39.026904 | 19.592106 | 50.949283 | 7.910939 | 54.609083 | 20.267106 | 35.388200 | 22.029107 | 42.936165 | 19.485828 | 41.552401 | 7.411665 | 755.522602 | 79.750418 | 4.039752 | 38.330834 | 3.760707 | 24.988033 | 24.988033 |
std | 102.524891 | 7.935988 | 1.606066 | 3.979299 | 2.192974 | 4.069813 | 2.006111 | 3.254576 | 2.042884 | 4.341321 | 1.844623 | 9.022034 | 6.090347 | 31.149806 | 2.109993 | 5.114208 | 1.956162 | 5.224361 | 2.014712 | 4.151497 | 5.317409 | 7.399441 | 14.901088 | 2.451221 | 11.794719 | 4.194648 | 14.496634 | 14.496634 |
min | 10.000000 | 0.000000 | 16.790000 | 27.023333 | 16.100000 | 20.463333 | 17.200000 | 28.766667 | 15.100000 | 27.660000 | 15.330000 | 29.815000 | -6.065000 | 1.000000 | 15.390000 | 23.200000 | 16.306667 | 29.600000 | 14.890000 | 29.166667 | -5.000000 | 729.300000 | 24.000000 | 0.000000 | 1.000000 | -6.600000 | 0.005322 | 0.005322 |
25% | 50.000000 | 0.000000 | 20.760000 | 37.333333 | 18.790000 | 37.900000 | 20.790000 | 36.900000 | 19.530000 | 35.530000 | 18.277500 | 45.400000 | 3.626667 | 30.025000 | 18.700000 | 31.500000 | 20.790000 | 39.066667 | 18.000000 | 38.500000 | 3.666667 | 750.933333 | 70.333333 | 2.000000 | 29.000000 | 0.900000 | 12.497889 | 12.497889 |
50% | 60.000000 | 0.000000 | 21.600000 | 39.656667 | 20.000000 | 40.500000 | 22.100000 | 38.530000 | 20.666667 | 38.400000 | 19.390000 | 49.090000 | 7.300000 | 55.290000 | 20.033333 | 34.863333 | 22.100000 | 42.375000 | 19.390000 | 40.900000 | 6.916667 | 756.100000 | 83.666667 | 3.666667 | 40.000000 | 3.433333 | 24.897653 | 24.897653 |
75% | 100.000000 | 0.000000 | 22.600000 | 43.066667 | 21.500000 | 43.260000 | 23.290000 | 41.760000 | 22.100000 | 42.156667 | 20.619643 | 53.663333 | 11.256000 | 83.226667 | 21.600000 | 39.000000 | 23.390000 | 46.536000 | 20.600000 | 44.338095 | 10.408333 | 760.933333 | 91.666667 | 5.500000 | 40.000000 | 6.566667 | 37.583769 | 37.583769 |
max | 1080.000000 | 70.000000 | 26.260000 | 63.360000 | 29.856667 | 56.026667 | 29.236000 | 50.163333 | 26.200000 | 51.090000 | 25.795000 | 96.321667 | 28.290000 | 99.900000 | 26.000000 | 51.400000 | 27.230000 | 58.780000 | 24.500000 | 53.326667 | 26.100000 | 772.300000 | 100.000000 | 14.000000 | 66.000000 | 15.500000 | 49.996530 | 49.996530 |
# Column dtypes and non-null counts; used to check for missing values.
energy.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 date 19735 non-null object
1 Appliances 19735 non-null int64
2 lights 19735 non-null int64
3 T1 19735 non-null float64
4 RH_1 19735 non-null float64
5 T2 19735 non-null float64
6 RH_2 19735 non-null float64
7 T3 19735 non-null float64
8 RH_3 19735 non-null float64
9 T4 19735 non-null float64
10 RH_4 19735 non-null float64
11 T5 19735 non-null float64
12 RH_5 19735 non-null float64
13 T6 19735 non-null float64
14 RH_6 19735 non-null float64
15 T7 19735 non-null float64
16 RH_7 19735 non-null float64
17 T8 19735 non-null float64
18 RH_8 19735 non-null float64
19 T9 19735 non-null float64
20 RH_9 19735 non-null float64
21 T_out 19735 non-null float64
22 Press_mm_hg 19735 non-null float64
23 RH_out 19735 non-null float64
24 Windspeed 19735 non-null float64
25 Visibility 19735 non-null float64
26 Tdewpoint 19735 non-null float64
27 rv1 19735 non-null float64
28 rv2 19735 non-null float64
dtypes: float64(26), int64(2), object(1)
memory usage: 4.4+ MB
There are no missing values in the dataset.
# Scale every numeric column (everything except 'date' and 'lights') with MinMaxScaler.
# NOTE(review): the scaler is fitted on the full dataset *before* the
# train/test split, so test-set minima/maxima influence the training features.
# Kept as-is because the results reported below depend on it; consider
# fitting the scaler on the training portion only.
scaler = MinMaxScaler()
normalised_df = pd.DataFrame(scaler.fit_transform(energy.drop(columns=['date', 'lights'])),
                             columns=energy.drop(columns=['date', 'lights']).columns)

# Predictors are all scaled columns except the target; the target is 'Appliances'.
features_df = normalised_df.drop(columns=['Appliances'])
energy_target = normalised_df.Appliances

# 70/30 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(features_df, energy_target, test_size=.3, random_state=42)
From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6).
# Single-feature regression: predict T6 from T2 (both taken from the scaled feature set).
lin_reg = LinearRegression()
lin_reg.fit(X_train[['T2']], X_train.T6)
T6_pred = lin_reg.predict(X_test[['T2']])
print(f'r^2 score: {round(r2_score(X_test.T6, T6_pred), 2)}')
r^2 score: 0.64
# Mean absolute error of the T2 -> T6 model on the test split.
t6_mae = mean_absolute_error(X_test.T6, T6_pred)
print(f'MAE: {round(t6_mae, 2)}')
MAE: 0.08
# Residual sum of squares of the T2 -> T6 model on the test split.
t6_rss = np.sum(np.square(X_test.T6 - T6_pred))
print(f'Residual Sum of Squares: {round(t6_rss, 2)}')
Residual Sum of Squares: 274.9
# Root mean squared error of the T2 -> T6 model on the test split.
t6_rmse = np.sqrt(mean_squared_error(X_test.T6, T6_pred))
print(f'Root Mean Squared Error: {round(t6_rmse, 3)}')
Root Mean Squared Error: 0.215
T7 1.0
T1 1.0
T2 1.0
T9 1.0
RH_8 1.0
RH_out 1.0
Tdewpoint 1.0
Visibility 1.0
Windspeed 1.0
Press_mm_hg 1.0
T_out 1.0
RH_9 1.0
T8 1.0
RH_7 1.0
Appliances 1.0
RH_6 1.0
T6 1.0
RH_5 1.0
RH_4 1.0
RH_3 1.0
T3 1.0
RH_2 1.0
RH_1 1.0
rv1 1.0
rv2 1.0
T5 1.0
T4 1.0
dtype: float64
# Largest raw (unscaled) value of each column except 'date' and 'lights', ascending.
energy.drop(columns=['date', 'lights']).max().sort_values()
Windspeed 14.000000
Tdewpoint 15.500000
T9 24.500000
T5 25.795000
T7 26.000000
T_out 26.100000
T4 26.200000
T1 26.260000
T8 27.230000
T6 28.290000
T3 29.236000
T2 29.856667
rv1 49.996530
rv2 49.996530
RH_3 50.163333
RH_4 51.090000
RH_7 51.400000
RH_9 53.326667
RH_2 56.026667
RH_8 58.780000
RH_1 63.360000
Visibility 66.000000
RH_5 96.321667
RH_6 99.900000
RH_out 100.000000
Press_mm_hg 772.300000
Appliances 1080.000000
dtype: float64
# Smallest raw (unscaled) value of each column except 'date' and 'lights', ascending.
energy.drop(columns=['date', 'lights']).min().sort_values()
Tdewpoint -6.600000
T6 -6.065000
T_out -5.000000
Windspeed 0.000000
rv2 0.005322
rv1 0.005322
Visibility 1.000000
RH_6 1.000000
Appliances 10.000000
T9 14.890000
T4 15.100000
T5 15.330000
T7 15.390000
T2 16.100000
T8 16.306667
T1 16.790000
T3 17.200000
RH_2 20.463333
RH_7 23.200000
RH_out 24.000000
RH_1 27.023333
RH_4 27.660000
RH_3 28.766667
RH_9 29.166667
RH_8 29.600000
RH_5 29.815000
Press_mm_hg 729.300000
dtype: float64
def get_weights_df(model, feat, col_name):
    """Return the coefficients of a fitted linear model as a tidy DataFrame.

    Parameters
    ----------
    model : object exposing a 1-D ``coef_`` attribute (one weight per feature).
    feat : DataFrame whose ``columns`` label the coefficients, in order.
    col_name : name given to the coefficient column of the result.

    Returns
    -------
    DataFrame with columns ``['Features', col_name]``, sorted by ascending
    weight, with the weights rounded to 3 decimal places.
    """
    weights = pd.Series(model.coef_, feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = ['Features', col_name]
    # Series.round returns a new Series; assign it back so the rounding
    # actually takes effect instead of being silently discarded.
    weights_df[col_name] = weights_df[col_name].round(3)
    return weights_df
# Ridge regression of the scaled 'Appliances' target on all scaled features.
ridge_reg = Ridge(alpha=0.4)
ridge_reg.fit(X_train, y_train)
Ridge(alpha=0.4)
# Lasso regression of the scaled 'Appliances' target on all scaled features.
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X_train, y_train)
Lasso(alpha=0.001)
# Ordinary least-squares regression of the scaled 'Appliances' target on all scaled features.
model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()
# Per-feature weights of each fitted model.
linear_model_weights = get_weights_df(model, X_train, 'Linear_Model_Weight')
ridge_weights_df = get_weights_df(ridge_reg, X_train, 'Ridge_Weight')
lasso_weights_df = get_weights_df(lasso_reg, X_train, 'Lasso_weight')

# Join the three weight tables on the feature name for a side-by-side comparison.
final_weights = pd.merge(linear_model_weights, ridge_weights_df, on='Features')
final_weights = pd.merge(final_weights, lasso_weights_df, on='Features')
final_weights.sort_values("Linear_Model_Weight", ascending=False)
| | Features | Linear_Model_Weight | Ridge_Weight | Lasso_weight |
---|---|---|---|---|
25 | RH_1 | 0.553547 | 0.519525 | 0.017880 |
24 | T3 | 0.290627 | 0.288087 | 0.000000 |
23 | T6 | 0.236425 | 0.217292 | 0.000000 |
22 | Tdewpoint | 0.117758 | 0.083128 | 0.000000 |
21 | T8 | 0.101995 | 0.101028 | 0.000000 |
20 | RH_3 | 0.096048 | 0.095135 | 0.000000 |
19 | RH_6 | 0.038049 | 0.035519 | -0.000000 |
18 | Windspeed | 0.029183 | 0.030268 | 0.002912 |
17 | T4 | 0.028981 | 0.027384 | -0.000000 |
16 | RH_4 | 0.026386 | 0.024579 | 0.000000 |
15 | RH_5 | 0.016006 | 0.016152 | 0.000000 |
14 | Visibility | 0.012307 | 0.012076 | 0.000000 |
13 | T7 | 0.010319 | 0.010098 | -0.000000 |
12 | Press_mm_hg | 0.006839 | 0.006584 | -0.000000 |
11 | rv2 | 0.000770 | 0.000748 | -0.000000 |
10 | rv1 | 0.000770 | 0.000748 | -0.000000 |
9 | T1 | -0.003281 | -0.018406 | 0.000000 |
8 | T5 | -0.015657 | -0.019853 | -0.000000 |
7 | RH_9 | -0.039800 | -0.041367 | -0.000000 |
6 | RH_7 | -0.044614 | -0.045977 | -0.000000 |
5 | RH_out | -0.077671 | -0.054724 | -0.049557 |
4 | RH_8 | -0.157595 | -0.156830 | -0.000110 |
3 | T9 | -0.189941 | -0.188916 | -0.000000 |
2 | T2 | -0.236178 | -0.201397 | 0.000000 |
1 | T_out | -0.321860 | -0.262172 | 0.000000 |
0 | RH_2 | -0.456698 | -0.411071 | -0.000000 |
# Test-set predictions from the linear, ridge and lasso models respectively.
y_pred_lg = model.predict(X_test)
y_pred_r = ridge_reg.predict(X_test)
y_pred_l = lasso_reg.predict(X_test)
# RMSE of the ridge model on the test split.
ridge_rmse = np.sqrt(mean_squared_error(y_test, y_pred_r))
print(f'Root Mean Squared Error: {round(ridge_rmse, 3)}')
Root Mean Squared Error: 0.088
# RMSE of the lasso model on the test split.
lasso_rmse = np.sqrt(mean_squared_error(y_test, y_pred_l))
print(f'Root Mean Squared Error: {round(lasso_rmse, 3)}')
Root Mean Squared Error: 0.094